/**********************************************************************
  Copyright(c) 2019 Arm Corporation All rights reserved.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
    * Neither the name of Arm Corporation nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
	/* Target ARMv8-A with the "crypto" extension enabled so that the
	   sha256h / sha256h2 / sha256su0 / sha256su1 instructions used in this
	   file are accepted by the assembler. */
	.arch armv8-a+crypto
	.text
	/* Both directives take a power-of-two exponent on this target:
	   4-byte alignment first, then 8-byte alignment provided that needs
	   no more than 7 bytes of padding. */
	.align	2
	.p2align 3,,7

/*
Macros
*/

/*
 * declare_var_vector_reg name, reg
 *
 * Gives SIMD register number <reg> three symbolic names, one per view:
 *   <name>_v  ->  v<reg>
 *   <name>_q  ->  q<reg>
 *   <name>_s  ->  s<reg>
 */
.macro	declare_var_vector_reg name:req,reg:req
	/* vector view */
	\name\()_v	.req	v\reg
	/* 32-bit scalar view */
	\name\()_s	.req	s\reg
	/* 128-bit scalar view */
	\name\()_q	.req	q\reg
.endm
/**
 * sha256_4_rounds_high msg, tmp0
 *
 * Issues one sha256h/sha256h2 pair for each of the four lanes (l0..l3) and
 * prepares the input for the following invocation.
 *
 *   msg  : suffix selecting the per-lane message registers lN_<msg>; they
 *          are added to the round constants to form the NEXT input.
 *   tmp0 : suffix selecting the per-lane registers lN_<tmp0>. On entry they
 *          hold the input consumed by this invocation; on exit they hold
 *          lN_<msg> + key.
 *
 * Reads 16 bytes of round constants from [tmp] and advances tmp by 16.
 * Overwrites key and tmp0..tmp3 and updates lN_abcd / lN_efgh.
 *
 * key and tmp3 are both declared as register 15, so copying l1_abcd/l3_abcd
 * into tmp3 destroys the constants. That is why key is loaded a second time
 * (from the same address) before lanes 2 and 3 are handled, and why tmp is
 * advanced only after that second load.
 *
 * Invoked directly for rounds 48-59 and through sha256_4_rounds_low for
 * rounds 0-47.
 */
.macro sha256_4_rounds_high msg:req,tmp0:req
	/* lanes 0 and 1 */
	ldr		key_q , [tmp]
	/* take a copy of the pending input, then overwrite it with the next one */
	mov		tmp0_v.16b,l0_\tmp0\()_v.16b
	mov		tmp1_v.16b,l1_\tmp0\()_v.16b
	add		l0_\tmp0\()_v.4s,l0_\msg\()_v.4s,key_v.4s
	add		l1_\tmp0\()_v.4s,l1_\msg\()_v.4s,key_v.4s
	/* sha256h overwrites abcd, but sha256h2 needs the value from before it */
	mov		tmp2_v.16b,l0_abcd_v.16b
	mov		tmp3_v.16b,l1_abcd_v.16b
	sha256h		l0_abcd_q,l0_efgh_q,tmp0_v.4s
	sha256h		l1_abcd_q,l1_efgh_q,tmp1_v.4s
	sha256h2	l0_efgh_q,tmp2_q,tmp0_v.4s
	sha256h2	l1_efgh_q,tmp3_q,tmp1_v.4s

	/* lanes 2 and 3: same constants, reloaded because tmp3 overwrote key */
	ldr		key_q , [tmp]
	mov		tmp0_v.16b,l2_\tmp0\()_v.16b
	mov		tmp1_v.16b,l3_\tmp0\()_v.16b
	/* step to the next 16 bytes of constants for the following invocation */
	add		tmp,tmp,16
	add		l2_\tmp0\()_v.4s,l2_\msg\()_v.4s,key_v.4s
	add		l3_\tmp0\()_v.4s,l3_\msg\()_v.4s,key_v.4s
	mov		tmp2_v.16b,l2_abcd_v.16b
	mov		tmp3_v.16b,l3_abcd_v.16b
	sha256h		l2_abcd_q,l2_efgh_q,tmp0_v.4s
	sha256h		l3_abcd_q,l3_efgh_q,tmp1_v.4s
	sha256h2	l2_efgh_q,tmp2_q,tmp0_v.4s
	sha256h2	l3_efgh_q,tmp3_q,tmp1_v.4s


.endm
/**
 * sha256_4_rounds_low msg0, msg1, msg2, msg3, tmp0
 *
 * Macro for rounds 0-47: the same hash update as sha256_4_rounds_high,
 * wrapped in the message-schedule update that replaces lN_<msg0> in every
 * lane.
 *
 *   msg0        : suffix of the message registers that are rewritten
 *                 (sha256su0 with msg1, then sha256su1 with msg2 and msg3).
 *   msg1..msg3  : suffixes of the other three message registers; msg1 is
 *                 also the one handed to sha256_4_rounds_high.
 *   tmp0        : forwarded unchanged to sha256_4_rounds_high.
 *
 * Callers rotate the four msg arguments by one position on every
 * invocation. Side effects on key, tmp and tmp0..tmp3 are those of
 * sha256_4_rounds_high.
 */
.macro sha256_4_rounds_low msg0:req,msg1:req,msg2:req,msg3:req,tmp0:req
	/* first half of the schedule update, all four lanes */
	sha256su0		l0_\msg0\()_v.4s,l0_\msg1\()_v.4s
	sha256su0		l1_\msg0\()_v.4s,l1_\msg1\()_v.4s
	sha256su0		l2_\msg0\()_v.4s,l2_\msg1\()_v.4s
	sha256su0		l3_\msg0\()_v.4s,l3_\msg1\()_v.4s
	/* hash update placed between the two schedule halves */
	sha256_4_rounds_high	\msg1,\tmp0
	/* second half of the schedule update, all four lanes */
	sha256su1		l0_\msg0\()_v.4s,l0_\msg2\()_v.4s,l0_\msg3\()_v.4s
	sha256su1		l1_\msg0\()_v.4s,l1_\msg2\()_v.4s,l1_\msg3\()_v.4s
	sha256su1		l2_\msg0\()_v.4s,l2_\msg2\()_v.4s,l2_\msg3\()_v.4s
	sha256su1		l3_\msg0\()_v.4s,l3_\msg2\()_v.4s,l3_\msg3\()_v.4s
.endm


/*
 * SIMD register allocation
 *
 *   v0  - v7   working digest, two registers (abcd, efgh) per lane
 *   v8  - v11  per-lane input (message words + round constants) for the
 *              rounds that are issued next
 *   v12 - v15  scratch; v15 is also the round-constant register
 *   v16 - v31  message schedule, four registers per lane
 *
 * Several names deliberately share a register; see the notes below.
 */

/* Working digest, lanes 0-3 */
	declare_var_vector_reg	l0_abcd,0
	declare_var_vector_reg	l0_efgh,1
	declare_var_vector_reg	l1_abcd,2
	declare_var_vector_reg	l1_efgh,3
	declare_var_vector_reg	l2_abcd,4
	declare_var_vector_reg	l2_efgh,5
	declare_var_vector_reg	l3_abcd,6
	declare_var_vector_reg	l3_efgh,7

/* Per-lane pending round input */
	declare_var_vector_reg	l0_tmp0,8
	declare_var_vector_reg	l1_tmp0,9
	declare_var_vector_reg	l2_tmp0,10
	declare_var_vector_reg	l3_tmp0,11

/* Scratch registers */
	declare_var_vector_reg	tmp0,12
	declare_var_vector_reg	tmp1,13
	declare_var_vector_reg	tmp2,14
	declare_var_vector_reg	tmp3,15

/* Round constants: same register as tmp3, so any write to tmp3 destroys it */
	declare_var_vector_reg	key,15

/* Message schedule, lane 0 */
	declare_var_vector_reg	l0_msg0,16
	declare_var_vector_reg	l0_msg1,17
	declare_var_vector_reg	l0_msg2,18
	declare_var_vector_reg	l0_msg3,19
/* Message schedule, lane 1 */
	declare_var_vector_reg	l1_msg0,20
	declare_var_vector_reg	l1_msg1,21
	declare_var_vector_reg	l1_msg2,22
	declare_var_vector_reg	l1_msg3,23
/* Message schedule, lane 2 */
	declare_var_vector_reg	l2_msg0,24
	declare_var_vector_reg	l2_msg1,25
	declare_var_vector_reg	l2_msg2,26
	declare_var_vector_reg	l2_msg3,27
/* Message schedule, lane 3 */
	declare_var_vector_reg	l3_msg0,28
	declare_var_vector_reg	l3_msg1,29
	declare_var_vector_reg	l3_msg2,30
	declare_var_vector_reg	l3_msg3,31

/*
 * Digest as it was before the current block. These names reuse the msg0 and
 * msg1 registers of the message schedule (16/17, 20/21, 24/25, 28/29), so
 * they may only be written once those message registers are no longer
 * needed. Lane 0 uses 20/21 and lane 1 uses 16/17.
 */
	declare_var_vector_reg	l0_abcd_saved,20
	declare_var_vector_reg	l0_efgh_saved,21
	declare_var_vector_reg	l1_abcd_saved,16
	declare_var_vector_reg	l1_efgh_saved,17
	declare_var_vector_reg	l2_abcd_saved,24
	declare_var_vector_reg	l2_efgh_saved,25
	declare_var_vector_reg	l3_abcd_saved,28
	declare_var_vector_reg	l3_efgh_saved,29



/*
 *	void sha256_mb_ce_x4(ISAL_SHA256_JOB *, ISAL_SHA256_JOB *, ISAL_SHA256_JOB *, ISAL_SHA256_JOB *, int);
 *
 * Runs the SHA-256 block function over `len` consecutive 64-byte blocks of
 * four independent jobs (lanes 0-3), interleaving the four lanes.
 *
 * In:
 *   x0-x3  job pointers for lanes 0-3. From each job this routine reads an
 *          8-byte data pointer at offset 0 and the 32-byte digest at
 *          offsets 64 (abcd) and 80 (efgh).
 *   w4     number of 64-byte blocks to process for every lane (signed).
 *          A value <= 0 returns immediately without touching the jobs.
 * Out:
 *   The digest at offsets 64/80 of every job is updated in place after each
 *   block. The data pointer stored in the job is NOT advanced.
 * Clobbers:
 *   x5-x9, w4, v0-v7, v16-v31, condition flags. d8-d15 are saved on the
 *   stack and restored (64-byte frame).
 */
/*
Arguments list
*/
	l0_job 	.req	x0
	l1_job 	.req	x1
	l2_job 	.req	x2
	l3_job 	.req	x3
	len	.req	w4
	l0_data	.req	x5
	l1_data	.req	x6
	l2_data	.req	x7
	l3_data	.req	x8
	tmp	.req	x9
	.global	sha256_mb_ce_x4
	.type	sha256_mb_ce_x4, %function
sha256_mb_ce_x4:
	/* The loop below tests its counter only after a block has been
	   processed, so an empty or negative count must be rejected here. */
	cmp	len, 0
	ble	.Lsha256_mb_ce_x4_ret

	/* v8-v15 are used as scratch; their low halves (d8-d15) are
	   callee-saved, so spill exactly those 64 bytes. */
	stp	d8,d9,[sp,-64]!
	stp	d10,d11,[sp,16]
	stp	d12,d13,[sp,32]
	stp	d14,d15,[sp,48]

	/* per lane: input pointer and current digest */
	ldr	l0_data, [l0_job]
	ldr	l0_abcd_q, [l0_job, 64]
	ldr	l0_efgh_q, [l0_job, 80]
	ldr	l1_data,   [l1_job]
	ldr	l1_abcd_q, [l1_job, 64]
	ldr	l1_efgh_q, [l1_job, 80]
	ldr	l2_data,   [l2_job]
	ldr	l2_abcd_q, [l2_job, 64]
	ldr	l2_efgh_q, [l2_job, 80]
	ldr	l3_data,   [l3_job]
	ldr	l3_abcd_q, [l3_job, 64]
	ldr	l3_efgh_q, [l3_job, 80]



start_loop:

	/* Point tmp at the start of the round-constant table. adrp + :lo12:
	   is used instead of adr because KEY is placed in a different
	   section and adr only reaches +/-1 MiB from this instruction. */
	adrp	tmp, KEY
	add	tmp, tmp, :lo12:KEY
	/* load one 64-byte block per lane */
	ld1	{l0_msg0_v.4s-l0_msg3_v.4s},[l0_data]
	ld1	{l1_msg0_v.4s-l1_msg3_v.4s},[l1_data]
	ld1	{l2_msg0_v.4s-l2_msg3_v.4s},[l2_data]
	ld1	{l3_msg0_v.4s-l3_msg3_v.4s},[l3_data]
	/* first 16 bytes of constants; the macros fetch the rest */
	ldr	key_q,[tmp]
	add	tmp,tmp,16
	/* advance the input pointers and the block counter */
	add	l0_data,l0_data,64
	add	l1_data,l1_data,64
	add	l2_data,l2_data,64
	add	l3_data,l3_data,64
	sub	len, len, #1
	/* These flags are consumed by the bgt at the bottom of the loop;
	   no instruction in between writes the condition flags. */
	cmp	len, 0


	/* Byte-swap every 32-bit message word and form the first round
	   input (msg0 + constants) for each lane. */
	rev32	l0_msg0_v.16b,l0_msg0_v.16b
	rev32	l0_msg1_v.16b,l0_msg1_v.16b
	add	l0_tmp0_v.4s, l0_msg0_v.4s,key_v.4s
	rev32	l0_msg2_v.16b,l0_msg2_v.16b
	rev32	l0_msg3_v.16b,l0_msg3_v.16b

	rev32	l1_msg0_v.16b,l1_msg0_v.16b
	rev32	l1_msg1_v.16b,l1_msg1_v.16b
	add	l1_tmp0_v.4s, l1_msg0_v.4s,key_v.4s
	rev32	l1_msg2_v.16b,l1_msg2_v.16b
	rev32	l1_msg3_v.16b,l1_msg3_v.16b

	rev32	l2_msg0_v.16b,l2_msg0_v.16b
	rev32	l2_msg1_v.16b,l2_msg1_v.16b
	add	l2_tmp0_v.4s, l2_msg0_v.4s,key_v.4s
	rev32	l2_msg2_v.16b,l2_msg2_v.16b
	rev32	l2_msg3_v.16b,l2_msg3_v.16b

	rev32	l3_msg0_v.16b,l3_msg0_v.16b
	rev32	l3_msg1_v.16b,l3_msg1_v.16b
	add	l3_tmp0_v.4s, l3_msg0_v.4s,key_v.4s
	rev32	l3_msg2_v.16b,l3_msg2_v.16b
	rev32	l3_msg3_v.16b,l3_msg3_v.16b



	/* rounds 0-47: the msg arguments rotate by one on every step */
	sha256_4_rounds_low	msg0,msg1,msg2,msg3,tmp0    /* rounds 0-3 */
	sha256_4_rounds_low	msg1,msg2,msg3,msg0,tmp0
	sha256_4_rounds_low	msg2,msg3,msg0,msg1,tmp0
	sha256_4_rounds_low	msg3,msg0,msg1,msg2,tmp0
	sha256_4_rounds_low	msg0,msg1,msg2,msg3,tmp0    /* rounds 16-19 */
	sha256_4_rounds_low	msg1,msg2,msg3,msg0,tmp0
	sha256_4_rounds_low	msg2,msg3,msg0,msg1,tmp0
	sha256_4_rounds_low	msg3,msg0,msg1,msg2,tmp0
	sha256_4_rounds_low	msg0,msg1,msg2,msg3,tmp0    /* rounds 32-35 */
	sha256_4_rounds_low	msg1,msg2,msg3,msg0,tmp0
	sha256_4_rounds_low	msg2,msg3,msg0,msg1,tmp0
	sha256_4_rounds_low	msg3,msg0,msg1,msg2,tmp0



	sha256_4_rounds_high	msg1,tmp0			/* rounds 48-51 */

	/* msg0 and msg1 are no longer needed; their registers are reused to
	   hold the digest from before this block, re-read from the jobs */
	ldr	l0_abcd_saved_q, [l0_job, 64]
	ldr	l1_abcd_saved_q, [l1_job, 64]
	ldr	l2_abcd_saved_q, [l2_job, 64]
	ldr	l3_abcd_saved_q, [l3_job, 64]
	ldr	l0_efgh_saved_q, [l0_job, 80]
	ldr	l1_efgh_saved_q, [l1_job, 80]
	ldr	l2_efgh_saved_q, [l2_job, 80]
	ldr	l3_efgh_saved_q, [l3_job, 80]

	sha256_4_rounds_high	msg2,tmp0			/* rounds 52-55 */
	sha256_4_rounds_high	msg3,tmp0			/* rounds 56-59 */

	/* rounds 60-63: the input is already in lN_tmp0, nothing left to prepare */
	mov		tmp2_v.16b,l0_abcd_v.16b
	sha256h		l0_abcd_q,l0_efgh_q,l0_tmp0_v.4s
	sha256h2	l0_efgh_q,tmp2_q,l0_tmp0_v.4s

	mov		tmp2_v.16b,l1_abcd_v.16b
	sha256h		l1_abcd_q,l1_efgh_q,l1_tmp0_v.4s
	sha256h2	l1_efgh_q,tmp2_q,l1_tmp0_v.4s

	mov		tmp2_v.16b,l2_abcd_v.16b
	sha256h		l2_abcd_q,l2_efgh_q,l2_tmp0_v.4s
	sha256h2	l2_efgh_q,tmp2_q,l2_tmp0_v.4s

	mov		tmp2_v.16b,l3_abcd_v.16b
	sha256h		l3_abcd_q,l3_efgh_q,l3_tmp0_v.4s
	sha256h2	l3_efgh_q,tmp2_q,l3_tmp0_v.4s

	/* combine state: add the digest from before this block */
	add     l0_abcd_v.4s,l0_abcd_v.4s,l0_abcd_saved_v.4s
	add     l0_efgh_v.4s,l0_efgh_v.4s,l0_efgh_saved_v.4s
	add     l1_abcd_v.4s,l1_abcd_v.4s,l1_abcd_saved_v.4s
	add     l1_efgh_v.4s,l1_efgh_v.4s,l1_efgh_saved_v.4s
	add     l2_abcd_v.4s,l2_abcd_v.4s,l2_abcd_saved_v.4s
	add     l2_efgh_v.4s,l2_efgh_v.4s,l2_efgh_saved_v.4s
	add     l3_abcd_v.4s,l3_abcd_v.4s,l3_abcd_saved_v.4s
	add     l3_efgh_v.4s,l3_efgh_v.4s,l3_efgh_saved_v.4s

	/* Write the digests back after every block; the next iteration
	   re-reads them from memory as its "saved" state. */
	str	l0_abcd_q,	[l0_job, 64]
	str	l0_efgh_q, 	[l0_job, 80]
	str	l1_abcd_q,	[l1_job, 64]
	str	l1_efgh_q, 	[l1_job, 80]
	str	l2_abcd_q,	[l2_job, 64]
	str	l2_efgh_q, 	[l2_job, 80]
	str	l3_abcd_q,	[l3_job, 64]
	str	l3_efgh_q, 	[l3_job, 80]

	/* flags come from the "cmp len, 0" near the top of the loop */
	bgt	start_loop


	/* restore d8-d15 and release the frame */
	ldp	d10,d11,[sp,16]
	ldp	d12,d13,[sp,32]
	ldp	d14,d15,[sp,48]
	ldp	d8, d9, [sp], 64
.Lsha256_mb_ce_x4_ret:
	ret

	.size	sha256_mb_ce_x4, .-sha256_mb_ce_x4
	/*
	 * SHA-256 round constants, 64 32-bit words, 16-byte aligned.
	 * Each line below is one 16-byte group, i.e. what a single
	 * "ldr key_q, [tmp]" in the code fetches.
	 *
	 * The table lives in plain .rodata rather than a mergeable constant
	 * section: the code walks it as one contiguous array by adding 16 to
	 * a pointer, whereas a mergeable section lets the linker treat every
	 * 16-byte entry as an independent constant.
	 */
	.section	.rodata
	.align	4
	.type	KEY, %object
KEY:
	.word	0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5
	.word	0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5
	.word	0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3
	.word	0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174
	.word	0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC
	.word	0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA
	.word	0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7
	.word	0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967
	.word	0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13
	.word	0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85
	.word	0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3
	.word	0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070
	.word	0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5
	.word	0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3
	.word	0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208
	.word	0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
	.size	KEY, .-KEY
